{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd \n",
    "import matplotlib.pyplot as plt \n",
    "from collections import Counter\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import RidgeClassifier\n",
    "from sklearn.metrics import f1_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv(r'C:\\Users\\Rookie\\Desktop\\nlp\\train_set.csv',sep='\\t')\n",
    "x_train = train_df['text']\n",
    "y_train = train_df['label']\n",
    "\n",
    "\n",
    "vectorizer = TfidfVectorizer(ngram_range=(1,3),max_features=3000)\n",
    "\n",
    "x_train = vectorizer.fit_transform(x_train)  # 转换成tfidf\n",
    "x_train[:50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5399 3117 1070 4321 4568 2621 5466 3772 4516 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2491 4109 1757 7539 648 3695 3038 4490 23 7019...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2673 5076 6835 2835 5948 5677 3247 4124 2465 5...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4562 4893 2210 4761 3659 1324 2595 5949 4583 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4269 7134 2614 1724 4464 1324 3370 3370 2106 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49995</th>\n",
       "      <td>3725 4498 2282 1647 6293 4245 4498 3615 1141 2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49996</th>\n",
       "      <td>4811 465 3800 1394 3038 2376 2327 5165 3070 57...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49997</th>\n",
       "      <td>5338 1952 3117 4109 299 6656 6654 3792 6831 21...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49998</th>\n",
       "      <td>893 3469 5775 584 2490 4223 6569 6663 2124 168...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49999</th>\n",
       "      <td>2400 4409 4412 2210 5122 4464 7186 2465 1327 9...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>50000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text\n",
       "0      5399 3117 1070 4321 4568 2621 5466 3772 4516 2...\n",
       "1      2491 4109 1757 7539 648 3695 3038 4490 23 7019...\n",
       "2      2673 5076 6835 2835 5948 5677 3247 4124 2465 5...\n",
       "3      4562 4893 2210 4761 3659 1324 2595 5949 4583 2...\n",
       "4      4269 7134 2614 1724 4464 1324 3370 3370 2106 2...\n",
       "...                                                  ...\n",
       "49995  3725 4498 2282 1647 6293 4245 4498 3615 1141 2...\n",
       "49996  4811 465 3800 1394 3038 2376 2327 5165 3070 57...\n",
       "49997  5338 1952 3117 4109 299 6656 6654 3792 6831 21...\n",
       "49998  893 3469 5775 584 2490 4223 6569 6663 2124 168...\n",
       "49999  2400 4409 4412 2210 5122 4464 7186 2465 1327 9...\n",
       "\n",
       "[50000 rows x 1 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text_df = pd.read_csv(r'C:\\Users\\Rookie\\Desktop\\nlp\\test_a.csv', sep='\\t')\n",
    "text_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8745508835598\n"
     ]
    }
   ],
   "source": [
    "train_df = pd.read_csv(r'C:\\Users\\Rookie\\Desktop\\nlp\\train_set.csv',sep='\\t')\n",
    "x_train = train_df['text']\n",
    "y_train = train_df['label']\n",
    "\n",
    "\n",
    "vectorizer = TfidfVectorizer(ngram_range=(1,3),max_features=3000)\n",
    "\n",
    "x_train = vectorizer.fit_transform(x_train)  # 转换成tfidf\n",
    "# print(x_train)\n",
    "\n",
    "clf = RidgeClassifier()\n",
    "clf.fit(x_train[:10000],y_train.values[:10000])\n",
    "\n",
    "y_pred = clf.predict(x_train[10000:])\n",
    "print(f1_score(y_train.values[10000:],y_pred=y_pred, average='macro'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
